package com;


import com.sun.webkit.WebPage;
import javafx.scene.web.WebEngine;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;

import java.util.ArrayList;
import java.util.List;
import java.util.Vector;

public class WanhoPageProcessor implements PageProcessor {

    //部分一：抓取网站的相关配置，包括编码，抓取间隔，重试次数等
    private Site site=Site.me().
            setTimeOut(1000).
            setRetryTimes(3).
            setSleepTime(1000).
            setCharset("UTF-8");
    //获取粘点
    public Site getSite() {
        return site;
    }
    //爬取过程
    public void process(Page page) {

        //获取当前页的所有喜报
        List<String> list=page.getHtml().xpath
                ("//div[@class='main_l']/ul/li")
                .all();
        //要保存喜报的集合
        Vector<PageVo> pageVoList=new Vector<PageVo>();
        //遍历喜报
        String title;
        String content;
        String img;
        for(String item:list)
        {
            Html tmp=Html.create(item);
           //拿到标题
            title=tmp.xpath("//div[@class='content']/h4/a/text()").toString();
            //内容
           content=tmp.xpath("//div[@class='content']/p/text()").toString();
            //图片路径
            img=tmp.xpath("//a/img/@src").toString();
            //加入结合
            PageVo vo=new PageVo(title,content,img);
            pageVoList.add(vo);
            System.out.println(vo);
        }

        //保存数据至page中，后续进行持续化
        page.putField("e_list",pageVoList);


        //获得其他页
        page.addTargetRequests(getothersUrls());


    }

    public List<String> getothersUrls(){
        List<String> urllist=new ArrayList<String>();
        for(int i=2;i<5;i++){
            urllist.add("http://www.wanho.net/a/jyxb/list_15_"+i+".html");
        }
        return urllist;
    }

}
